{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pima Indians Diabetes Data Set数据探索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148</td>\n",
       "      <td>72</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85</td>\n",
       "      <td>66</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183</td>\n",
       "      <td>64</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89</td>\n",
       "      <td>66</td>\n",
       "      <td>23</td>\n",
       "      <td>94</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137</td>\n",
       "      <td>40</td>\n",
       "      <td>35</td>\n",
       "      <td>168</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0          6                           148              72   \n",
       "1          1                            85              66   \n",
       "2          8                           183              64   \n",
       "3          1                            89              66   \n",
       "4          0                           137              40   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin   BMI  \\\n",
       "0                           35              0  33.6   \n",
       "1                           29              0  26.6   \n",
       "2                            0              0  23.3   \n",
       "3                           23             94  28.1   \n",
       "4                           35            168  43.1   \n",
       "\n",
       "   Diabetes_pedigree_function  Age  Target  \n",
       "0                       0.627   50       1  \n",
       "1                       0.351   31       0  \n",
       "2                       0.672   32       1  \n",
       "3                       0.167   21       0  \n",
       "4                       2.288   33       1  "
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = pd.read_csv(\"data/pima-indians-diabetes.csv\")\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>3.845052</td>\n",
       "      <td>120.894531</td>\n",
       "      <td>69.105469</td>\n",
       "      <td>20.536458</td>\n",
       "      <td>79.799479</td>\n",
       "      <td>31.992578</td>\n",
       "      <td>0.471876</td>\n",
       "      <td>33.240885</td>\n",
       "      <td>0.348958</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>3.369578</td>\n",
       "      <td>31.972618</td>\n",
       "      <td>19.355807</td>\n",
       "      <td>15.952218</td>\n",
       "      <td>115.244002</td>\n",
       "      <td>7.884160</td>\n",
       "      <td>0.331329</td>\n",
       "      <td>11.760232</td>\n",
       "      <td>0.476951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.078000</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>62.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>27.300000</td>\n",
       "      <td>0.243750</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>3.000000</td>\n",
       "      <td>117.000000</td>\n",
       "      <td>72.000000</td>\n",
       "      <td>23.000000</td>\n",
       "      <td>30.500000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>0.372500</td>\n",
       "      <td>29.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>6.000000</td>\n",
       "      <td>140.250000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>127.250000</td>\n",
       "      <td>36.600000</td>\n",
       "      <td>0.626250</td>\n",
       "      <td>41.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>17.000000</td>\n",
       "      <td>199.000000</td>\n",
       "      <td>122.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>846.000000</td>\n",
       "      <td>67.100000</td>\n",
       "      <td>2.420000</td>\n",
       "      <td>81.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "count  768.000000                    768.000000      768.000000   \n",
       "mean     3.845052                    120.894531       69.105469   \n",
       "std      3.369578                     31.972618       19.355807   \n",
       "min      0.000000                      0.000000        0.000000   \n",
       "25%      1.000000                     99.000000       62.000000   \n",
       "50%      3.000000                    117.000000       72.000000   \n",
       "75%      6.000000                    140.250000       80.000000   \n",
       "max     17.000000                    199.000000      122.000000   \n",
       "\n",
       "       Triceps_skin_fold_thickness  serum_insulin         BMI  \\\n",
       "count                   768.000000     768.000000  768.000000   \n",
       "mean                     20.536458      79.799479   31.992578   \n",
       "std                      15.952218     115.244002    7.884160   \n",
       "min                       0.000000       0.000000    0.000000   \n",
       "25%                       0.000000       0.000000   27.300000   \n",
       "50%                      23.000000      30.500000   32.000000   \n",
       "75%                      32.000000     127.250000   36.600000   \n",
       "max                      99.000000     846.000000   67.100000   \n",
       "\n",
       "       Diabetes_pedigree_function         Age      Target  \n",
       "count                  768.000000  768.000000  768.000000  \n",
       "mean                     0.471876   33.240885    0.348958  \n",
       "std                      0.331329   11.760232    0.476951  \n",
       "min                      0.078000   21.000000    0.000000  \n",
       "25%                      0.243750   24.000000    0.000000  \n",
       "50%                      0.372500   29.000000    0.000000  \n",
       "75%                      0.626250   41.000000    1.000000  \n",
       "max                      2.420000   81.000000    1.000000  "
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pregnants                         0\n",
       "Plasma_glucose_concentration      5\n",
       "blood_pressure                   35\n",
       "Triceps_skin_fold_thickness     227\n",
       "serum_insulin                   374\n",
       "BMI                              11\n",
       "Diabetes_pedigree_function        0\n",
       "Age                               0\n",
       "Target                            0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 将0值转换成NAN值\n",
    "Nan_column = ['Plasma_glucose_concentration','blood_pressure','Triceps_skin_fold_thickness','serum_insulin','BMI']\n",
    "X[Nan_column] = X[Nan_column].replace(0,np.NaN)\n",
    "X.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 对于特征值确实较多的列，新增一列表示该特征值是否缺失"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\jack-pc\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \n",
      "c:\\users\\jack-pc\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "X['Triceps_skin_fold_thickness_Missing'] = X['Triceps_skin_fold_thickness']\n",
    "Triceps_skin_fold_thickness_Missing = X['Triceps_skin_fold_thickness_Missing']\n",
    "Triceps_skin_fold_thickness_Missing[Triceps_skin_fold_thickness_Missing.notnull()] = 1\n",
    "Triceps_skin_fold_thickness_Missing[Triceps_skin_fold_thickness_Missing.isnull()] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\jack-pc\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n",
      "c:\\users\\jack-pc\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  after removing the cwd from sys.path.\n"
     ]
    }
   ],
   "source": [
    "X['serum_insulin_Missing'] = X['serum_insulin']\n",
    "serum_insulin_Missing = X['serum_insulin_Missing']\n",
    "serum_insulin_Missing[serum_insulin_Missing.notnull()] = 1\n",
    "serum_insulin_Missing[serum_insulin_Missing.isnull()] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "      <th>Triceps_skin_fold_thickness_Missing</th>\n",
       "      <th>serum_insulin_Missing</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148.0</td>\n",
       "      <td>72.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>94.0</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>168.0</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0          6                         148.0            72.0   \n",
       "1          1                          85.0            66.0   \n",
       "2          8                         183.0            64.0   \n",
       "3          1                          89.0            66.0   \n",
       "4          0                         137.0            40.0   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin   BMI  \\\n",
       "0                         35.0            NaN  33.6   \n",
       "1                         29.0            NaN  26.6   \n",
       "2                          NaN            NaN  23.3   \n",
       "3                         23.0           94.0  28.1   \n",
       "4                         35.0          168.0  43.1   \n",
       "\n",
       "   Diabetes_pedigree_function  Age  Target  \\\n",
       "0                       0.627   50       1   \n",
       "1                       0.351   31       0   \n",
       "2                       0.672   32       1   \n",
       "3                       0.167   21       0   \n",
       "4                       2.288   33       1   \n",
       "\n",
       "   Triceps_skin_fold_thickness_Missing  serum_insulin_Missing  \n",
       "0                                  1.0                    0.0  \n",
       "1                                  1.0                    0.0  \n",
       "2                                  0.0                    0.0  \n",
       "3                                  1.0                    1.0  \n",
       "4                                  1.0                    1.0  "
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分析缺失值和目标值的关系"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAELCAYAAADDZxFQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAGydJREFUeJzt3XuQFeWd//H3Ry6CEUVgdJEBIYpuIC6DjmguJt5+P5BsIiZqwSbrPZgqctskbEwqpUi0Kskm0bgmuBiNmBjUNYmi6yXeWbNRREQEXEpM8pMRFERldRGVyff3Rz8jx/GZ4QDTcwbm86o6Naeffrr7ew7D+czT3adbEYGZmVlru9W6ADMz65ocEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyeta6gB0xaNCgGD58eK3LMDPbqTz++OMvRUTd1vrt1AExfPhwFi5cWOsyzMx2KpL+XzX9vIvJzMyyHBBmZpblgDAzs6yd+hiEmVmtvP322zQ1NbFp06Zal9KmPn36UF9fT69evbZreQeEmdl2aGpqol+/fgwfPhxJtS7nPSKC9evX09TUxIgRI7ZrHd7FZGa2HTZt2sTAgQO7ZDgASGLgwIE7NMJxQJiZbaeuGg4tdrQ+B4SZmWX5GISZWQdYv349xx9/PAAvvPACPXr0oK6u+LLyggUL6N27d4dvc9GiRaxdu5YJEyZ0+LrBAWHWZR0+/bpal9BlPP4vp9e6hK0aOHAgixcvBmDGjBnsueeefOMb36h6+ebmZnr06LFN21y0aBFLly4tLSC8i8nMrGSf/OQnOfzwwxk9ejQ///nPAdi8eTP9+/fnO9/5DuPGjWPBggXMmzePQw45hKOPPpovfelLTJo0CYDXX3+dM888k3HjxjF27Fhuu+023njjDWbOnMn1119PQ0MDN998c4fXXdoIQlIfYD6we9rOzRFxoaRrgY8DG1LXMyNisYqjKT8BJgIbU/uisuozM+ssc+bMYcCAAWzcuJHGxkY+85nP0K9fPzZs2MBhhx3GxRdfzMaNGzn44IP5wx/+wLBhwzjttNPeWX7mzJlMmDCBa6+9lldeeYUjjzySJUuWcMEFF7B06VIuu+yyUuoucwTxJnBcRIwBGoAJko5K86ZHREN6LE5tJwIj02MqMKvE2szMOs2ll17KmDFj+NCHPkRTUxPPPvssAL179+bkk08GYPny5RxyyCEccMABSGLKlCnvLP/73/+eSy65hIaGBo499lg2bdrEc889V3rdpY0gIiKA19Nkr/SIdhY5CbguLfeIpP6SBkfEmrJqNDMr27333sv8+fN55JFH6Nu3Lx/96Eff+W5C37593zkVtfjoy4sIbrnlFg488MB3tc+fP7+8win5GISkHpIWA2uBeyLi0TTrEklLJF0qaffUNgRYVbF4U2ozM9tpbdiwgQEDBtC3b1+WLVvGY489lu03evRoVqxYwapVq4gIbrzxxnfmjR8/nssvv/yd6SeeeAKAfv368dprr5VWe6kBERHNEdEA1APjJH0Q+Bbwt8ARwADgm6l77hsd74lUSVMlLZS0cN26dSVVbmbWMT7xiU+wceNGxowZw8yZMznyyCOz/fbYYw+uuOIKTjjhBI4++mj2339/9t57bwAuvPBCNm7cyKGHHsro0aOZMWMGAMcddxxPPvkkY8eO3bkOUleKiFclPQhMiIgfpuY3Jf0CaDkPrAkYWrFYPbA6s67ZwGyAxsbG9nZZmZnVRMsHOBQXzLv77ruz/V599dV3TZ9wwgmsWLGCiOC8886jsbERgPe9731cddVV71m+rq6u1JumlTaCkFQnqX963hc4AfhvSYNTm4BJwNK0yDzgdBWOAjb4+IOZdSezZs2ioaGBUaNG8cYbb/D5z3++pvWUOYIYDMyR1IMiiG6KiNsl3S+pjmKX0mLgC6n/HRSnuK6kOM31
rBJrMzPrcqZPn8706dNrXcY7yjyLaQkwNtN+XBv9A5hWVj1mZrZt/E1qMzPLckCYmVmWA8LMzLJ8NVczsw7Q0VffreYKtnfddRdf+cpXaG5u5txzz+X888/v0Bo8gjAz2wk1Nzczbdo07rzzTpYvX87cuXNZvnx5h27DAWFmthNasGABBx10EO9///vp3bs3kydP5tZbb+3QbTggzMx2Qs8//zxDh265+ER9fT3PP/98h27DAWFmthPKXf215cqwHcUBYWa2E6qvr2fVqi0XwG5qamL//ffv0G04IMzMdkJHHHEEzzzzDH/+85956623uOGGG/jUpz7Vodvwaa5mZh2gmtNSO1LPnj254oorGD9+PM3NzZx99tmMHj26Y7fRoWszM7NOM3HiRCZOnFja+r2LyczMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWT7N1cysAzw389AOXd+wC57aap+zzz6b22+/nX333ZelS5d26PbBIwgzs53WmWeeyV133VXa+ksLCEl9JC2Q9KSkZZIuSu0jJD0q6RlJN0rqndp3T9Mr0/zhZdVmZrYr+NjHPsaAAQNKW3+ZI4g3geMiYgzQAEyQdBTwfeDSiBgJvAKck/qfA7wSEQcBl6Z+ZmZWI6UFRBReT5O90iOA44CbU/scYFJ6flKaJs0/Xh197VozM6taqccgJPWQtBhYC9wDPAu8GhGbU5cmYEh6PgRYBZDmbwAGllmfmZm1rdSAiIjmiGgA6oFxwAdy3dLP3GjhPXfEkDRV0kJJC9etW9dxxZqZ2bt0ymmuEfGqpAeBo4D+knqmUUI9sDp1awKGAk2SegJ7Ay9n1jUbmA3Q2Nj43lsqmZnVQDWnpXa0KVOm8OCDD/LSSy9RX1/PRRddxDnnnLP1BatUWkBIqgPeTuHQFziB4sDzA8ApwA3AGUDLXbbnpek/pvn3R+6eemZmBsDcuXNLXX+ZI4jBwBxJPSh2Zd0UEbdLWg7cIOli4Ang6tT/auCXklZSjBwml1ibmZltRWkBERFLgLGZ9j9RHI9o3b4JOLWseszMbNv4m9RmZtupq+8F39H6HBBmZtuhT58+rF+/vsuGRESwfv16+vTps93r8MX6zMy2Q319PU1NTXTl0+379OlDfX39di/vgDAz2w69evVixIgRtS6jVN7FZGZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsq7SAkDRU0gOSnpa0TNJXUvsMSc9LWpweEyuW+ZaklZJWSBpfVm1mZrZ1Zd5RbjPw9YhYJKkf8Like9K8SyPih5WdJY0CJgOjgf2BeyUdHBHNJdZoZmZtKG0EERFrImJRev4a8DQwpJ1FTgJuiIg3I+LPwEpgXFn1mZlZ+zrlGISk4cBY4NHU9EVJSyRdI2mf1DYEWFWxWBOZQJE0VdJCSQu78s3Czcx2dqUHhKQ9gd8AX42I/wFmAQcCDcAa4EctXTOLx3saImZHRGNENNbV1ZVUtZmZlRoQknpRhMP1EfFbgIh4MSKaI+KvwFVs2Y3UBAytWLweWF1mfWZm1rYyz2IScDXwdET8uKJ9cEW3k4Gl6fk8YLKk3SWNAEYCC8qqz8zM2lfmWUwfAf4ReErS4tT2bWCKpAaK3Ud/Ac4DiIhlkm4CllOcATXNZzCZmdVOaQEREQ+TP65wRzvLXAJcUlZNZmZWPX+T2szMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyqgoISfdV02ZmZruOdm85KqkPsAcwSNI+bLmF6F7A/iXXZmZmNbS1EcR5wOPA36afLY9bgZ+2t6CkoZIekPS0pGWSvpLaB0i6R9Iz6ec+qV2SLpe0
UtISSYft6IszM7Pt125ARMRPImIE8I2IeH9EjEiPMRFxxVbWvRn4ekR8ADgKmCZpFHA+cF9EjATuS9MAJwIj02MqMGv7X5aZme2odncxtYiIf5X0YWB45TIRcV07y6wB1qTnr0l6GhgCnAQck7rNAR4Evpnar4uIAB6R1F/S4LQeMzPrZFUFhKRfAgcCi4Hm1BxAmwHRavnhwFjgUWC/lg/9iFgjad/UbQiwqmKxptTmgDAzq4GqAgJoBEalv+63iaQ9gd8AX42I/5HUZtdM23u2J2kqxS4ohg0btq3lmJlZlar9HsRS4G+2deWSelGEw/UR8dvU/KKkwWn+YGBtam8ChlYsXg+sbr3OiJgdEY0R0VhXV7etJZmZWZWqDYhBwHJJd0ua1/JobwEVQ4Wrgacj4scVs+YBZ6TnZ1CcEdXSfno6m+koYIOPP5iZ1U61u5hmbMe6PwL8I/CUpMWp7dvA94CbJJ0DPAecmubdAUwEVgIbgbO2Y5tmZtZBqj2L6aFtXXFEPEz+uALA8Zn+AUzb1u2YmVk5qj2L6TW2HDDuDfQC/jci9iqrMDMzq61qRxD9KqclTQLGlVKRmZl1Cdt1NdeIuAU4roNrMTOzLqTaXUyfrpjcjeJ7Edv8nQgzM9t5VHsW0ycrnm8G/kJxaQwzM9tFVXsMwqecmpl1M9XeMKhe0u8krZX0oqTfSKovuzgzM6udancx/QL4NVu+1Pa51PZ/yijKzKzSczMPrXUJXcawC57qtG1VexZTXUT8IiI2p8e1gC+EZGa2C6s2IF6S9DlJPdLjc8D6MgszM7PaqjYgzgZOA16guD/DKfhaSWZmu7Rqj0F8FzgjIl6B4r7SwA8pgsPMzHZB1Y4g/q4lHAAi4mWKO8SZmdkuqtqA2E3SPi0TaQRR7ejDzMx2QtV+yP8I+C9JN1NcYuM04JLSqjIzs5qr9pvU10laSHGBPgGfjojlpVZmZmY1VfVuohQIDgUzs25iuy73bWZmuz4HhJmZZTkgzMwsq7SAkHRNuvrr0oq2GZKel7Q4PSZWzPuWpJWSVkgaX1ZdZmZWnTJHENcCEzLtl0ZEQ3rcASBpFDAZGJ2W+ZmkHiXWZmZmW1FaQETEfODlKrufBNwQEW9GxJ+BlcC4smozM7Otq8UxiC9KWpJ2QbV8O3sIsKqiT1NqMzOzGunsgJgFHAg0UFwV9kepXZm+kVuBpKmSFkpauG7dunKqNDOzzg2IiHgxIpoj4q/AVWzZjdQEDK3oWg+sbmMdsyOiMSIa6+p8zyIzs7J0akBIGlwxeTLQcobTPGCypN0ljQBGAgs6szYzM3u30q7IKmkucAwwSFITcCFwjKQGit1HfwHOA4iIZZJuoriUx2ZgWkQ0l1WbmZltXWkBERFTMs1Xt9P/EnyFWDOzLsPfpDYzs6xuf9Ofw6dfV+sSuozH/+X0WpdgZl2IRxBmZpblgDAzsywHhJmZZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzrNICQtI1ktZKWlrRNkDSPZKeST/3Se2SdLmklZKWSDqsrLrMzKw6ZY4grgUmtGo7H7gvIkYC96VpgBOBkekxFZhVYl1mZlaF0gIiIuYDL7dqPgmYk57PASZVtF8XhUeA/pIGl1WbmZltXWcfg9gvItYApJ/7pvYhwKqKfk2pzczMaqSrHKRWpi2yHaWpkhZKWrhu3bqSyzIz6746OyBebNl1lH6uTe1NwNCKfvXA6twKImJ2RDRGRGNdXV2pxZqZdWedHRDzgDPS8zOAWyvaT09nMx0FbGjZFWVmZrXRs6wVS5oLHAMMktQEXAh8D7hJ0jnAc8CpqfsdwERgJbAROKususzMrDqlBURETGlj1vGZvgFMK6sWMzPbdl3lILWZmXUxpY0gbOfz3MxDa11ClzHsgqdqXYJZzXkEYWZm
WQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsywFhZmZZNbmjnKS/AK8BzcDmiGiUNAC4ERgO/AU4LSJeqUV9ZmZW2xHEsRHREBGNafp84L6IGAncl6bNzKxGutIuppOAOen5HGBSDWsxM+v2ahUQAfxe0uOSpqa2/SJiDUD6uW+NajMzM2p0DAL4SESslrQvcI+k/652wRQoUwGGDRtWVn1mZt1eTUYQEbE6/VwL/A4YB7woaTBA+rm2jWVnR0RjRDTW1dV1VslmZt1OpweEpPdJ6tfyHPi/wFJgHnBG6nYGcGtn12ZmZlvUYhfTfsDvJLVs/9cRcZekx4CbJJ0DPAecWoPazMws6fSAiIg/AWMy7euB4zu7HjMzy+tKp7mamVkX4oAwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsq8sFhKQJklZIWinp/FrXY2bWXXWpgJDUA/gpcCIwCpgiaVRtqzIz6566VEAA44CVEfGniHgLuAE4qcY1mZl1S10tIIYAqyqmm1KbmZl1sp61LqAVZdriXR2kqcDUNPm6pBWlV9VNHACDgJdqXUeXcGHuV9Fqxb+bFTrmd/OAajp1tYBoAoZWTNcDqys7RMRsYHZnFtVdSFoYEY21rsOsNf9u1kZX28X0GDBS0ghJvYHJwLwa12Rm1i11qRFERGyW9EXgbqAHcE1ELKtxWWZm3VKXCgiAiLgDuKPWdXRT3nVnXZV/N2tAEbH1XmZm1u10tWMQZmbWRTgguqGtXc5E0u6SbkzzH5U0vPOrtO5G0jWS1kpa2sZ8Sbo8/V4ukXRYZ9fY3TggupkqL2dyDvBKRBwEXAp8v3OrtG7qWmBCO/NPBEamx1RgVifU1K05ILqfai5nchIwJz2/GThekr85ZqWKiPnAy+10OQm4LgqPAP0lDe6c6ronB0T3U83lTN7pExGbgQ3AwE6pzqxtvhRPJ3NAdD9bvZxJlX3MOpt/LzuZA6L72erlTCr7SOoJ7E37Q3+zzlDN7651IAdE91PN5UzmAWek56cA94e/MGO1Nw84PZ3NdBSwISLW1LqoXVmX+ya1lauty5lImgksjIh5wNXALyWtpBg5TK5dxdZdSJoLHAMMktQEXAj0AoiIKymusDARWAlsBM6qTaXdh79JbWZmWd7FZGZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAdENyNpoKTF6fGCpOcrpnu36nu3pH61qrU1SQ9Lasi0b1edkkZJelLSE21d0lxST0mvtjHvV5ImtbP+r0nqU8V6pkn6bDvrOUHSLe29ls4g6VxJIenjFW2nprZJafoXkg7ZxvWeLGl6R9drO85flOtmImI90AAgaQbwekT8sLJPunKrImJ851e47Xagzk8DN0fEdzuyngpfA64BNrXXKSJ+WtL2y/AUMAV4KE1PBp5smRkR2/zltYj4XceUZh3NIwgDQNJBkpZKuhJYBAyW1CSpf5p/VrpJy5OSfpHa9pP0W0kLJS1Ilz9A0sWS5kh6QNIzks5O7UPSKGBx2taH26ilp6RfSnoq9ftyq/k90l/vM9J0k6T+Fa/haknLJN3Z8hd8ZhufAr4IfEHSvantn9PySyV9KbPMbpJ+Jmm5pNuAQe28n/8E7Av8Z8v6U/v30nv4R0n7VrxfX03PD5Z0f+qzqPXIRtKRLe1puaslPSTpT5KmVfQ7I/2bLE4179bW+yrpn9JrelLSr9p6TcmDwIfTuvYChgHv3OCnZZS3LdtKI5PL0vNfSfqJpP9Kr+nk1N5D0pXp3/U2SXepndGbdQyPIKzSKOCsiPgCgNItICSNAb4JfDgiXpY0
IPW/HPhBRDySPshuBz6Y5h0KfBjYC1gk6T+AzwG3RcT3Vdy4qG8bdRwODIqIQ9P2+1fM6wn8GlgUEbkbGR0CTImIpyT9FphEcc+Ld4mIeZLGAS9FxGXp+Wcp7pfRA1gg6SFgecVipwAj0mvcP827MvcCIuJSSV8Hjo6IV7XloocPRcT5kn4MnA18r9Wic4EZEXFbCrfdgIPS+3A0xQ2cPhURTenf52DgeKA/8LSKgP8AcDLFv9dmSbMp/tJ/to339Z+BAyLirVbvdc5fKULiBGA/4Ja0vdba+jesZlv7Ah+h+B26CfgdcCrFpb0PBf4GeJo23nvrOB5BWKVnI+KxTPtxwI0R8TJAy0+KD4krJS2m+KDYR1LLh/4tEbEpItYC84EjKC4UeK6kC4EPRsTrbdSxEjgk/SU5nuJ+FC2upu1wgOJmSE+l548Dw7fymlscDfwmIjZGxGvp9Xy0VZ+PAXMj4q8R0UTxQbkt3oiIO9uqTdI+FB+qtwGk929jmv1B4GfA36dtt7g9It5K7/PLQB3Fv8sRwML0b/Nx4EDafl+XAb9ScRzk7Spexw0UgTOZTPgmO7KtW9JNgZaw5X4PHwVuSu/9arbs4rISOSCs0v+20S7y190XMC4iGtJjSES8kea17h8RcT/FxdjWANerjQOz6TjJ3wEPA18G/q1i9h8o7nC3exu1vlnxvJnqR8nV3jFvRy5e9lbF87Zqa2v9q9PyrQ/S516vKC7C2PLvckhEfLed93U8xV/j4yhCpcdWXscfgcOAvSLi2VyHHdxW5WtSq5/WiRwQVo17gcktu5YqdjHdC1Tu96788JokaXdJgyj+Ol8o6QDghYiYTXH/4bG5jUmqozhI/u8UV/SsvDn97LTdG9Jum44yHzhZUl9Je1Lc3vI/M30mp/35Qyj+Mm/Pa0DVZ1dFxCvAS5I+CSCpj6Q90uyXgb8HfpB2NbXnXuC09N63nLk2LPe+pg/o+hTe0ylGIHu0teJUZwDfAr7dVp+O2laFh4FTVBhMMZqzkvkYhG1VRCyR9ANgvqTNFLtHzqEIh1mSzqL4XXqALYHxGHAnxQ1eLoyIF1UcrP6apLeB1ymOSeQMBa5WsZM9KI5/VNbzA0mXANdKOr2DXuMCFZebbtnFNisdx6j8P3IzcCzFQdkVFIHRntnAvZJWAROqLOWzwL+l1/cW8JmKGteoOLh+R3uvO9V9Udr2bhS7cr5AMcJo/b72BH6t4jTh3YDvp11s7YqI/9hKl9y/YXZbqu525zdR7Opsee8f5d27Hq0Evty3dThJF5MO/ta6Ftt1SNozIl5Po5NHgSMjYl2t69qVeQRhZjuLO9Optb0oRqUOh5J5BGE1JWkh7/1D5R8iYnmu/3Zu40rgqFbNP46I6zpo/fMovg9Q6RsRcW+uf1cn6VyK74hUmh8RX871t12XA8LMzLJ8FpOZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVnW/wcFBxlOo2Ke2QAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.countplot(x = 'Triceps_skin_fold_thickness_Missing', hue = 'Target', data = X)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAELCAYAAADDZxFQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAFhlJREFUeJzt3X2wXHWd5/H31zwYHrJAQmAIF0iAwG4iJsAl6CoOAjXEOBJQZEitAoKEmUUXpwZqGNeCmJFdp0ZFGWbYAmQJM8jDokJgKR5kB1O6AyEghgCDRGHJhUBCxAgTAuTy3T/6XGjCL0kn3HP73vT7VdXV5/z6d875dlfX/dzz9OvITCRJ2tD72l2AJGlwMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKhre7gLei1133TUnTJjQ7jIkaUh58MEHX8zMcZvrN6QDYsKECSxevLjdZUjSkBIR/6+Vfh5ikiQVGRCSpCIDQpJUNKTPQUhSu7zxxhv09PSwbt26dpeyUaNGjaKrq4sRI0Zs1fIGhCRthZ6eHkaPHs2ECROIiHaX8y6ZyerVq+np6WHixIlbtQ4PMUnSVli3bh1jx44dlOEAEBGMHTv2Pe3hGBCStJUGazj0ea/1GRCSpCLPQUhSP1i9ejVHH300AM8//zzDhg1j3LjGzcqLFi1i5MiR/b7Nhx56iJUrVzJjxox+XzcYENKgdeh517S7hEHjwb89pd0lbNbYsWN5+OGHAZg7dy477rgj5557bsvL9/b2MmzYsC3a5kMPPcTSpUtrCwgPMUlSzT71qU9x6KGHMmXKFK688koA1q9fz84778zXvvY1pk+fzqJFi1iwYAEHHnggRxxxBF/+8pc5/vjjAXjllVc47bTTmD59OgcffDC33norr776KvPmzePaa69l2rRp3HTTTf1et3sQklSz+fPnM2bMGNauXUt3dzef+cxnGD16NGvWrOGQQw7hG9/4BmvXruWAAw7g5z//OXvvvTcnnXTSW8vPmzePGTNmcPXVV/PSSy9x+OGHs2TJEi644AKWLl3Kd7/73Vrqdg9Ckmp28cUXM3XqVD784Q/T09PDr3/9awBGjhzJCSecAMBjjz3GgQceyD777ENEMHv27LeWv+uuu7jooouYNm0aH//4x1m3bh3PPPNM7XW7ByFJNfrJT37CwoULue+++9huu+346Ec/+ta9Cdttt91bl6Jm5kbXkZncfPPN7Lfffu9oX7hwYX2F4x6EJNVqzZo1jBkzhu22245HH32UBx54oNhvypQpPPHEEyxfvpzM5IYbbnjrtWOPPZZLLrnkrflf/OIXAIwePZqXX365ttoNCEmq0Sc/+UnWrl3L1KlTmTdvHocffnix3/bbb8+ll17KMcccwxFHHMH48ePZaaedALjwwgtZu3YtBx10EFOmTGHu3LkAHHXUUfzyl7/k4IMPHlonqSNiL+Aa4A+AN4HLM/N7ETEXOBNYVXX9ambeXi3zV8AZQC/wXzLzzrrqk6S69P0Bh8aAeXfeWf5T9rvf/e4d88cccwxPPPEEmclZZ51Fd3c3ADvssANXXHHFu5YfN25crT+aVuc5iPXAX2TmQxExGngwIu6uXrs4M7/V3DkiJgMnA1OA8cBPIuKAzOytsUZJGjQuu+wyrr32Wl577TW6u7s588wz21pPbQGRmSuAFdX0yxHxOLDnJhaZBVyfma8BT0XEMmA68C911ShJg8l5553Heeed1+4y3jIg5yAiYgJwMHB/1fSliFgSEVdFxC5V257A8qbFeth0oEiSalR7QETEjsAPga9k5u+By4D9gGk09jC+3de1sPi7rvuKiDkRsTgiFq9ataqwiCSpP9QaEBExgkY4XJuZPwLIzBcyszcz3wSuoHEYCRp7DHs1Ld4FPLfhOjPz8szszszuvoGwJEn9r7aAiMbdH98HHs/M7zS179HU
7QRgaTW9ADg5It4fEROBScCiuuqTJG1anVcxfQT4PPBIRDxctX0VmB0R02gcPnoaOAsgMx+NiBuBx2hcAXW2VzBJGir6e/TdVkawveOOOzjnnHPo7e3li1/8Iueff36/1lDnVUw/o3xe4fZNLHMRcFFdNUnStqK3t5ezzz6bu+++m66uLg477DCOO+44Jk+e3G/b8E5qSRqCFi1axP7778++++7LyJEjOfnkk7nlllv6dRsGhCQNQc8++yx77fX2dT1dXV08++yz/boNA0KShqDS6K99I8P2FwNCkoagrq4uli9/+97inp4exo8f36/bMCAkaQg67LDDePLJJ3nqqad4/fXXuf766znuuOP6dRv+YJAk9YNWLkvtT8OHD+fSSy/l2GOPpbe3l9NPP50pU6b07zb6dW1DUH9fuzyUDfQXXNJ7M3PmTGbOnFnb+j3EJEkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklTU8Ze5SlJ/eGbeQf26vr0veGSzfU4//XRuu+02dtttN5YuXbrZ/lvKPQhJGqJOO+007rjjjtrWb0BI0hD1sY99jDFjxtS2fgNCklRkQEiSigwISVKRASFJKvIyV0nqB61cltrfZs+ezb333suLL75IV1cXX//61znjjDP6bf0GhCQNUdddd12t6/cQkySpyICQJBUZEJK0lTKz3SVs0nutz4CQpK0watQoVq9ePWhDIjNZvXo1o0aN2up1eJJakrZCV1cXPT09rFq1qt2lbNSoUaPo6ura6uUNCEnaCiNGjGDixIntLqNWHmKSJBUZEJKkotoCIiL2ioh/jojHI+LRiDinah8TEXdHxJPV8y5Ve0TEJRGxLCKWRMQhddUmSdq8Ovcg1gN/kZn/AfgQcHZETAbOB+7JzEnAPdU8wCeASdVjDnBZjbVJkjajtoDIzBWZ+VA1/TLwOLAnMAuYX3WbDxxfTc8CrsmG+4CdI2KPuuqTJG3agJyDiIgJwMHA/cDumbkCGiEC7FZ12xNY3rRYT9UmSWqD2gMiInYEfgh8JTN/v6muhbZ33YESEXMiYnFELB7M1x9L0lBXa0BExAga4XBtZv6oan6h79BR9byyau8B9mpavAt4bsN1Zublmdmdmd3jxo2rr3hJ6nB1XsUUwPeBxzPzO00vLQBOraZPBW5paj+luprpQ8CavkNRkqSBV+ed1B8BPg88EhEPV21fBb4J3BgRZwDPAJ+tXrsdmAksA9YCX6ixNknSZtQWEJn5M8rnFQCOLvRP4Oy66pEkbRnvpJYkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqqi0gIuKqiFgZEUub2uZGxLMR8XD1mNn02l9FxLKIeCIijq2rLklSa1oKiIi4p5W2DVwNzCi0X5yZ06rH7dW6JgMnA1OqZf4hIoa1UpskqR6bDIiIGBURY4BdI2KXiBhTPSYA4ze1bGYuBH7bYh2zgOsz87XMfApYBkxvcVlJUg02twdxFvAg8O+r577HLcDfb+U2vxQRS6pDULtUbXsCy5v69FRtkqQ22WRAZOb3MnMicG5m7puZE6vH1My8dCu2dxmwHzANWAF8u2qP0uZLK4iIORGxOCIWr1q1aitKkCS1YngrnTLz7yLiPwITmpfJzGu2ZGOZ+ULfdERcAdxWzfYAezV17QKe28g6LgcuB+ju7i6GiCTpvWspICLiH2n85/8w0Fs1J7BFARERe2Tmimr2BKDvCqcFwA8i4js0zm1MAhZtybolSf2rpYAAuoHJmdnyf+wRcR1wJI0T3D3AhcCRETGNRrg8TeMcB5n5aETcCDwGrAfOzsze0nolSQOj1YBYCvwBjfMGLcnM2YXm72+i/0XA
Ra2uX5JUr1YDYlfgsYhYBLzW15iZx9VSlSSp7VoNiLl1FiFJGnxavYrpp3UXIkkaXFq9iull3r4vYSQwAvi3zPx3dRUmSWqvVvcgRjfPR8TxOBSGJG3TWj0H8Q6ZeXNEnN/fxai9npl3ULtLGDT2vuCRdpcgtV2rh5g+3TT7Phr3RXgXsyRtw1rdg/hU0/R6Gje5zer3aiRJg0ar5yC+UHchkqTBpdUfDOqKiB9XvxD3QkT8MCK66i5OktQ+rf7k6P+kMaDeeBq/03Br1SZJ2ka1eg5iXGY2B8LVEfGVOgqSpA15hd3bBvIKu1b3IF6MiM9FxLDq8TlgdZ2FSZLaq9WAOB04CXiexoiuJwKeuJakbVirh5j+Gjg1M18CiIgxwLdoBIckaRvU6h7EB/vCASAzfwscXE9JkqTBoNWAeF9E7NI3U+1BbNUwHZKkoaHVP/LfBv5vRNxEY4iNk/DX3yRpm9bqndTXRMRi4CgggE9n5mO1ViZJaquWDxNVgWAoSFKHaPUchCSpwxgQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRUW0BExFURsTIilja1jYmIuyPiyep5l6o9IuKSiFgWEUsi4pC66pIktabOPYirgRkbtJ0P3JOZk4B7qnmATwCTqscc4LIa65IktaC2gMjMhcBvN2ieBcyvpucDxze1X5MN9wE7R8QeddUmSdq8gT4HsXtmrgConner2vcEljf166na3iUi5kTE4ohYvGrVqlqLlaRONlhOUkehLUsdM/PyzOzOzO5x48bVXJYkda6BDogX+g4dVc8rq/YeYK+mfl3AcwNcmySpyUAHxALg1Gr6VOCWpvZTqquZPgSs6TsUJUlqj5Z/cnRLRcR1wJHArhHRA1wIfBO4MSLOAJ4BPlt1vx2YCSwD1gJfqKsuSVJraguIzJy9kZeOLvRN4Oy6apEkbbnBcpJakjTIGBCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoa3o6NRsTTwMtAL7A+M7sjYgxwAzABeBo4KTNfakd9kqT27kF8PDOnZWZ3NX8+cE9mTgLuqeYlSW0ymA4xzQLmV9PzgePbWIskdbx2BUQCd0XEgxExp2rbPTNXAFTPu7WpNkkSbToHAXwkM5+LiN2AuyPiX1tdsAqUOQB77713XfVJUsdryx5EZj5XPa8EfgxMB16IiD0AqueVG1n28szszszucePGDVTJktRxBjwgImKHiBjdNw38EbAUWACcWnU7FbhloGuTJL2tHYeYdgd+HBF92/9BZt4REQ8AN0bEGcAzwGfbUJskqTLgAZGZvwGmFtpXA0cPdD2SpLLBdJmrJGkQMSAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklQ06AIiImZExBMRsSwizm93PZLUqQZVQETEMODvgU8Ak4HZETG5vVVJUmcaVAEBTAeWZeZvMvN14HpgVptrkqSONNgCYk9gedN8T9UmSRpgw9tdwAai0Jbv6BAxB5hTzb4SEU/UXlWH2Ad2BV5sdx2DwoWlr6Laxe9mk/75bu7TSqfBFhA9wF5N813Ac80dMvNy4PKBLKpTRMTizOxudx3ShvxutsdgO8T0ADApIiZGxEjgZGBBm2uSpI40qPYgMnN9RHwJuBMYBlyVmY+2uSxJ6kiDKiAAMvN24PZ219GhPHSnwcrvZhtEZm6+
lySp4wy2cxCSpEHCgOhAmxvOJCLeHxE3VK/fHxETBr5KdZqIuCoiVkbE0o28HhFxSfW9XBIRhwx0jZ3GgOgwLQ5ncgbwUmbuD1wM/M3AVqkOdTUwYxOvfwKYVD3mAJcNQE0dzYDoPK0MZzILmF9N3wQcHRHeOaZaZeZC4Leb6DILuCYb7gN2jog9Bqa6zmRAdJ5WhjN5q09mrgfWAGMHpDpp4xyKZ4AZEJ1ns8OZtNhHGmh+LweYAdF5NjucSXOfiBgO7MSmd/2lgdDKd1f9yIDoPK0MZ7IAOLWaPhH4P+kNM2q/BcAp1dVMHwLWZOaKdhe1LRt0d1KrXhsbziQi5gGLM3MB8H3gHyNiGY09h5PbV7E6RURcBxwJ7BoRPcCFwAiAzPwfNEZYmAksA9YCX2hPpZ3DO6klSUUeYpIkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiCkSkT8aUSc0s/rvDoiTqymryyMnNvKOuZGREbE/k1tf161dVfzt0fEzlu43n5/v9q2eKOchpyIGF4NItivqpuxapOZX3wPiz9C44bFb1TzJwKPNa175lbUU+v71dDnHoTaJiJ2iIj/HRG/jIilEfEnEXFoRPw0Ih6MiDv7hnOOiHsj4r9FxE+Bc5r/M69ef6V6PrJa/saI+FVEfDMi/lNELIqIRyJiv03UMzcizm3a3t9Uy/0qIo6o2qdUbQ9XP1ozKSImNP/ITUScGxFzC+u/t+k//lci4qLqvd8XEbtv5uO6mWpY9ojYl8YIu6ua1v10ROxa+kyr178ZEY9VNX9rC97v9tVnuaT6Ean7+96Dtn0GhNppBvBcZk7NzA8AdwB/B5yYmYcCVwEXNfXfOTP/MDO/vZn1TgXOAQ4CPg8ckJnTgSuBL29BfcOr5b5CY9gHgD8FvpeZ04BuGgPIbY0dgPsycyqwEDhzM/1/DyyPiA8As4EbNtLvXZ9pRIwBTgCmZOYHeXsvZEOl9/ufafx41AeBvwYObe3taVtgQKidHgGOqf5zPYLGSJ0fAO6OiIeBr9EYsbPPxv4obuiBzFyRma8BvwbuatrehC2o70fV84NNy/0L8NWI+Etgn8x8dQvW1+x14LbC+jflehqHmY4HfryRPu/4TDNzDY1wWQdcGRGfpjGOUUnp/X602i6ZuRRY0kKd2kYYEGqbzPwVjf9IHwH+O/AZ4NHMnFY9DsrMP2pa5N+aptdTfX+rX7sb2fTaa03TbzbNv8mWnXfrW663b7nM/AFwHPAqcGdEHNVcS2VUC+t+o2mE3LfWvxm30tgjeiYzf1/qsOFnGhEXVOdrpgM/pBEud2xk/e96v5R/g0EdwoBQ20TEeGBtZv4T8C3gcGBcRHy4en1EREzZyOJP8/bhjllUo37WrTr+/5vMvITG8NMfBF4AdouIsRHxfuCP69h2tbfyl7zzsNuG9W34mR4SETsCO2Xm7TQOH03bgs3+DDipWvdkGoft1CG8ikntdBDwtxHxJvAG8Gc0/hu/JCJ2ovH9/C7waGHZK4BbImIRcA/v3Luo058An4uIN4DngXmZ+UY1XPr9wFPAv9a18cy8fjNdSp/paBqf1SgaewR/vgWb/AdgfkQsAX5B4xDTmi0uXEOSw31L2qiIGAaMyMx11RVg99A46f96m0vTAHAPQtKmbA/8c0SMoLH38WeGQ+dwD0IdJyL+K/DZDZr/V2Zu9Nj+QBisdalzGRCSpCKvYpIkFRkQkqQiA0KSVGRASJKKDAhJUtH/B7iQBLLrnjvRAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Count rows per 'serum_insulin_Missing' value, split by 'Target', drawn on the current axes.\n",
    "sns.countplot(data=X, hue='Target', x='serum_insulin_Missing', ax=plt.gca())\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 缺失值与目标值之间貌似没有明显关系，可视为随机缺失，因此使用中位数填充缺失值"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 填充缺失值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pregnants                              0\n",
       "Plasma_glucose_concentration           0\n",
       "blood_pressure                         0\n",
       "Triceps_skin_fold_thickness            0\n",
       "serum_insulin                          0\n",
       "BMI                                    0\n",
       "Diabetes_pedigree_function             0\n",
       "Age                                    0\n",
       "Target                                 0\n",
       "Triceps_skin_fold_thickness_Missing    0\n",
       "serum_insulin_Missing                  0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-column medians of X; fillna applies each median to the NaNs of the matching column.\n",
    "medians = X.median(axis=0)\n",
    "X = X.fillna(value=medians)\n",
    "# Remaining NaN count per column.\n",
    "X.isna().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据标准化 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\jack-pc\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\sklearn\\preprocessing\\data.py:625: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.\n",
      "  return self.partial_fit(X, y)\n",
      "c:\\users\\jack-pc\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\sklearn\\base.py:462: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.\n",
      "  return self.fit(X, **fit_params).transform(X)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Split off the label column.\n",
    "y_train = X['Target']\n",
    "# Drop the label and the two *_Missing columns, then cast to float64 up front:\n",
    "# StandardScaler converts to float64 anyway, and feeding it int64 columns makes\n",
    "# older scikit-learn releases emit a DataConversionWarning.\n",
    "X_train = X.drop(['Target', 'Triceps_skin_fold_thickness_Missing', 'serum_insulin_Missing'], axis=1).astype('float64')\n",
    "# Keep the column names: fit_transform returns a bare ndarray without them.\n",
    "feature_name = X_train.columns\n",
    "\n",
    "# Standardize every feature column (StandardScaler defaults: zero mean, unit variance).\n",
    "ss_X = StandardScaler()\n",
    "X_train = ss_X.fit_transform(X_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存成新文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.639947</td>\n",
       "      <td>0.866045</td>\n",
       "      <td>-0.031990</td>\n",
       "      <td>0.670643</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.166619</td>\n",
       "      <td>0.468492</td>\n",
       "      <td>1.425995</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-1.205066</td>\n",
       "      <td>-0.528319</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.852200</td>\n",
       "      <td>-0.365061</td>\n",
       "      <td>-0.190672</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.233880</td>\n",
       "      <td>2.016662</td>\n",
       "      <td>-0.693761</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-1.332500</td>\n",
       "      <td>0.604397</td>\n",
       "      <td>-0.105584</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-1.073567</td>\n",
       "      <td>-0.528319</td>\n",
       "      <td>-0.695245</td>\n",
       "      <td>-0.540642</td>\n",
       "      <td>-0.633881</td>\n",
       "      <td>-0.920763</td>\n",
       "      <td>-1.041549</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1.141852</td>\n",
       "      <td>0.504422</td>\n",
       "      <td>-2.679076</td>\n",
       "      <td>0.670643</td>\n",
       "      <td>0.316566</td>\n",
       "      <td>1.549303</td>\n",
       "      <td>5.484909</td>\n",
       "      <td>-0.020496</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0   0.639947                      0.866045       -0.031990   \n",
       "1  -0.844885                     -1.205066       -0.528319   \n",
       "2   1.233880                      2.016662       -0.693761   \n",
       "3  -0.844885                     -1.073567       -0.528319   \n",
       "4  -1.141852                      0.504422       -2.679076   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin       BMI  \\\n",
       "0                     0.670643      -0.181541  0.166619   \n",
       "1                    -0.012301      -0.181541 -0.852200   \n",
       "2                    -0.012301      -0.181541 -1.332500   \n",
       "3                    -0.695245      -0.540642 -0.633881   \n",
       "4                     0.670643       0.316566  1.549303   \n",
       "\n",
       "   Diabetes_pedigree_function       Age  Target  \n",
       "0                    0.468492  1.425995       1  \n",
       "1                   -0.365061 -0.190672       0  \n",
       "2                    0.604397 -0.105584       1  \n",
       "3                   -0.920763 -1.041549       0  \n",
       "4                    5.484909 -0.020496       1  "
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebuild a labelled frame from the scaled values. Reuse y_train's index so the\n",
    "# column-wise concat pairs every row with its own label even when that index is\n",
    "# not the default 0..n-1 (concat with axis=1 aligns on index labels, not position).\n",
    "X_train = pd.DataFrame(data=X_train, index=y_train.index, columns=feature_name)\n",
    "X = pd.concat([X_train, y_train], axis=1)\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "# Create the output folder first: to_csv does not create missing directories.\n",
    "out_path = Path('data/FE_pima-indians-diabetes.csv')\n",
    "out_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "# Write the frame with a header row and without the index column.\n",
    "X.to_csv(out_path, index=False, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
